# 150802 split Libtier15 COi tiermix reads (based on splitreads_ins_v11.R)
# changes 160707 - gz
# ---- Lane L001: configuration ----
# Relative paths below (inputs via "../", and any relative output file names
# taken from the combos table) resolve against this working directory.
setwd("/Volumes/NGS 2TB/160707_Biomass/L001_demultiplexed")

# Last character position kept by substr() when savesequ() trims the
# sequence and quality lines.
readL <- 250

# Paired gzipped FASTQ inputs; file1[i] and file2[i] are read in lockstep.
file1 <- c("../Lipbp-2015_S1_L001_R1_001.fastq.gz")
file2 <- c("../Lipbp-2015_S1_L001_R2_001.fastq.gz")

# barcodes: the columns `barcode` and `ID` are used by the main loop.
# combos:   the columns `ID`, `File1` and `File2` are used by the main loop.
barcodes <- read.csv("../in_line_barcodes_v1.csv", stringsAsFactors = FALSE)
combos <- read.csv("../BP_rerun_v1 L001.csv", stringsAsFactors = FALSE)


# Append selected read pairs to two output files, trimming leading characters
# from the sequence and quality line of every exported record.
#
# Arguments:
#   file_name1, file_name2  Output paths for the first / second read of each
#                           pair. Records are appended, never overwritten.
#   rm1, rm2                Number of leading characters to drop from lines 2
#                           and 4 of each record in reads1 / reads2.
#   idx                     1-based record numbers to export; record r occupies
#                           lines 4r-3 .. 4r. Defaults to the global `A`.
#   reads1, reads2          Character vectors of lines, four per record.
#                           Default to the globals `data1` / `data2`.
#   read_len                Last character position kept by the trim.
#                           Defaults to the global `readL`.
#
# The last four arguments default (lazily) to the globals the script sets
# before each call, so existing calls `savesequ(f1, f2, rm1, rm2)` behave as
# before; passing them explicitly makes the function usable on its own.
#
# Returns NULL invisibly. Does nothing when `idx` is empty.
savesequ <- function(file_name1 = "N_debres_r1.txt", file_name2 = "N_debres_r2.txt",
                     rm1 = 0, rm2 = 0,
                     idx = A, reads1 = data1, reads2 = data2, read_len = readL) {
  if (length(idx) == 0) {
    return(invisible(NULL))
  }

  # Expand every record number to its four line numbers, in ascending order.
  line_idx <- sort(c(idx * 4 - 3, idx * 4 - 2, idx * 4 - 1, idx * 4))

  # Pull the selected records and trim lines 2 and 4 of each one.
  trim_records <- function(lines, n_remove) {
    out <- lines[line_idx]
    to_trim <- c(seq(2, length(out), 4), seq(4, length(out), 4))
    # substr() is 1-based, so dropping n characters means starting at n + 1.
    out[to_trim] <- substr(out[to_trim], n_remove + 1, read_len)
    out
  }

  cat(trim_records(reads1, rm1), file = file_name1, append = TRUE, sep = "\n")
  cat(trim_records(reads2, rm2), file = file_name2, append = TRUE, sep = "\n")
  invisible(NULL)
}

# ---- Lane L001: demultiplex ----
# Lines read per chunk: 100,000 records of four lines each.
chunk <- 100000 * 4

# Loop-invariant lookups, computed once instead of once per chunk.
# Index 1 of barcodeID is the placeholder for "no barcode matched", because
# match(..., nomatch = 0) + 1 maps a miss to position 1.
barcodeID <- c("NA", barcodes$ID)

# Number of leading characters to remove from read 1 / read 2, parsed from
# a single character of each half of the combo ID.
rm1 <- as.numeric(sub("B..(.)_.*", "\\1", combos$ID))
rm2 <- as.numeric(sub("B..._...(.)*", "\\1", combos$ID))

# keep adapters option:
# rm1 <- rep(0, nrow(combos))
# rm2 <- rep(0, nrow(combos))

for (i in seq_along(file1)) {
  con1 <- gzfile(file1[i], "rt")
  con2 <- gzfile(file2[i], "rt")

  mytime <- Sys.time()
  repeat {
    data1 <- readLines(con1, chunk) # read chunk of read-1 lines
    data2 <- readLines(con2, chunk) # read matching chunk of read-2 lines

    if (length(data1) == 0) break

    # Only index records that are actually present. The final chunk is
    # usually shorter than `chunk`; indexing up to `chunk` produced NA
    # entries that ended up as literal "NA" lines in the N_debres files.
    n_rec <- length(data1) %/% 4
    if (length(data2) < n_rec * 4) {
      close(con1)
      close(con2)
      stop("'", file2[i], "' has fewer lines than '", file1[i],
           "'; the paired files are out of sync", call. = FALSE)
    }
    sequ_lines <- seq_len(n_rec) * 4 - 2 # line 2 of every record

    # The first five characters of each read are the in-line barcode.
    sequ1 <- substr(data1[sequ_lines], 1, 5)
    sequ2 <- substr(data2[sequ_lines], 1, 5)

    match_sequ1 <- match(sequ1, barcodes$barcode, nomatch = 0)
    match_sequ2 <- match(sequ2, barcodes$barcode, nomatch = 0)

    # Build "<ID1>_<ID2>" per read pair and look it up in the combos table;
    # pairs with no matching combo stay NA.
    tags <- paste(barcodeID[match_sequ1 + 1], barcodeID[match_sequ2 + 1], sep = "_")
    match_sequ <- match(tags, combos$ID)

    # plot(table(match_sequ))

    # Save every tag combination listed in the combos table. savesequ()
    # picks up the record selection from the global `A`.
    for (k in seq_len(nrow(combos))) {
      A <- which(match_sequ == k)
      if (length(A) > 0) {
        savesequ(combos$File1[k], combos$File2[k], rm1[k], rm2[k])
      }
    }

    # Everything that matched no combo goes, untrimmed, to the debris files.
    A <- which(is.na(match_sequ))
    if (length(A) > 0) {
      savesequ("N_debres_r1.txt", "N_debres_r2.txt", 0, 0)
    }
  } # end of repeat loop over chunks

  print("done in:")
  print(Sys.time() - mytime) # elapsed time (was negated before)
  print("")
  close(con1)
  close(con2)
} # end of loop over all sequence files




# ---- Lane L002: configuration ----
# NOTE(review): the L001 section works from ".../160707_Biomass/L001_demultiplexed",
# so its "../" inputs live in ".../160707_Biomass". Here the working directory
# is ".../160707_Biomass" itself, which makes "../" resolve to "/Volumes/NGS 2TB"
# and sends relative output files into ".../160707_Biomass". This looks like a
# missing "/L002_demultiplexed" suffix -- confirm against the disk layout.
setwd("/Volumes/NGS 2TB/160707_Biomass")

# Paired gzipped FASTQ inputs; file1[i] and file2[i] are read in lockstep.
file1 <- c("../Lipbp-2015_S1_L002_R1_001.fastq.gz")
file2 <- c("../Lipbp-2015_S1_L002_R2_001.fastq.gz")

# barcodes: the columns `barcode` and `ID` are used by the main loop.
# combos:   the columns `ID`, `File1` and `File2` are used by the main loop.
barcodes <- read.csv("../in_line_barcodes_v1.csv", stringsAsFactors = FALSE)
combos <- read.csv("../BP_rerun_v1 L002.csv", stringsAsFactors = FALSE)


# Append selected read pairs to two output files, trimming leading characters
# from the sequence and quality line of every exported record.
# (Re-defined here so the L002 section can be run on its own.)
#
# Arguments:
#   file_name1, file_name2  Output paths for the first / second read of each
#                           pair. Records are appended, never overwritten.
#   rm1, rm2                Number of leading characters to drop from lines 2
#                           and 4 of each record in reads1 / reads2.
#   idx                     1-based record numbers to export; record r occupies
#                           lines 4r-3 .. 4r. Defaults to the global `A`.
#   reads1, reads2          Character vectors of lines, four per record.
#                           Default to the globals `data1` / `data2`.
#   read_len                Last character position kept by the trim.
#                           Defaults to the global `readL`.
#
# The last four arguments default (lazily) to the globals the script sets
# before each call, so existing calls `savesequ(f1, f2, rm1, rm2)` behave as
# before; passing them explicitly makes the function usable on its own.
#
# Returns NULL invisibly. Does nothing when `idx` is empty.
savesequ <- function(file_name1 = "N_debres_r1.txt", file_name2 = "N_debres_r2.txt",
                     rm1 = 0, rm2 = 0,
                     idx = A, reads1 = data1, reads2 = data2, read_len = readL) {
  if (length(idx) == 0) {
    return(invisible(NULL))
  }

  # Expand every record number to its four line numbers, in ascending order.
  line_idx <- sort(c(idx * 4 - 3, idx * 4 - 2, idx * 4 - 1, idx * 4))

  # Pull the selected records and trim lines 2 and 4 of each one.
  trim_records <- function(lines, n_remove) {
    out <- lines[line_idx]
    to_trim <- c(seq(2, length(out), 4), seq(4, length(out), 4))
    # substr() is 1-based, so dropping n characters means starting at n + 1.
    out[to_trim] <- substr(out[to_trim], n_remove + 1, read_len)
    out
  }

  cat(trim_records(reads1, rm1), file = file_name1, append = TRUE, sep = "\n")
  cat(trim_records(reads2, rm2), file = file_name2, append = TRUE, sep = "\n")
  invisible(NULL)
}

# ---- Lane L002: demultiplex ----
# Lines read per chunk: 100,000 records of four lines each.
chunk <- 100000 * 4

# Loop-invariant lookups, computed once instead of once per chunk.
# Index 1 of barcodeID is the placeholder for "no barcode matched", because
# match(..., nomatch = 0) + 1 maps a miss to position 1.
barcodeID <- c("NA", barcodes$ID)

# Number of leading characters to remove from read 1 / read 2, parsed from
# a single character of each half of the combo ID.
rm1 <- as.numeric(sub("B..(.)_.*", "\\1", combos$ID))
rm2 <- as.numeric(sub("B..._...(.)*", "\\1", combos$ID))

for (i in seq_along(file1)) {
  con1 <- gzfile(file1[i], "rt")
  con2 <- gzfile(file2[i], "rt")

  mytime <- Sys.time()
  repeat {
    data1 <- readLines(con1, chunk) # read chunk of read-1 lines
    data2 <- readLines(con2, chunk) # read matching chunk of read-2 lines

    if (length(data1) == 0) break

    # Only index records that are actually present. The final chunk is
    # usually shorter than `chunk`; indexing up to `chunk` produced NA
    # entries that ended up as literal "NA" lines in the N_debres files.
    n_rec <- length(data1) %/% 4
    if (length(data2) < n_rec * 4) {
      close(con1)
      close(con2)
      stop("'", file2[i], "' has fewer lines than '", file1[i],
           "'; the paired files are out of sync", call. = FALSE)
    }
    sequ_lines <- seq_len(n_rec) * 4 - 2 # line 2 of every record

    # The first five characters of each read are the in-line barcode.
    sequ1 <- substr(data1[sequ_lines], 1, 5)
    sequ2 <- substr(data2[sequ_lines], 1, 5)

    match_sequ1 <- match(sequ1, barcodes$barcode, nomatch = 0)
    match_sequ2 <- match(sequ2, barcodes$barcode, nomatch = 0)

    # Build "<ID1>_<ID2>" per read pair and look it up in the combos table;
    # pairs with no matching combo stay NA.
    tags <- paste(barcodeID[match_sequ1 + 1], barcodeID[match_sequ2 + 1], sep = "_")
    match_sequ <- match(tags, combos$ID)

    # plot(table(match_sequ))

    # Save every tag combination listed in the combos table. savesequ()
    # picks up the record selection from the global `A`.
    for (k in seq_len(nrow(combos))) {
      A <- which(match_sequ == k)
      if (length(A) > 0) {
        savesequ(combos$File1[k], combos$File2[k], rm1[k], rm2[k])
      }
    }

    # Everything that matched no combo goes, untrimmed, to the debris files.
    A <- which(is.na(match_sequ))
    if (length(A) > 0) {
      savesequ("N_debres_r1.txt", "N_debres_r2.txt", 0, 0)
    }
  } # end of repeat loop over chunks

  print("done in:")
  print(Sys.time() - mytime) # elapsed time (was negated before)
  print("")
  close(con1)
  close(con2)
} # end of loop over all sequence files

